import tkinter as tk
from tkinter import ttk, messagebox
from tkinterdnd2 import DND_FILES, TkinterDnD
import os
import pdfplumber
import re
import concurrent.futures # For parallel processing

# --- Configuration ---
# Threshold compared against the average number of extracted characters per
# page in extract_text_from_pdf(); an average at or above this value marks the
# PDF as text-based rather than scanned.
MIN_CHARS_FOR_NON_SCANNED_PDF_PER_PAGE_AVG = 10

# --- PDF Processing Logic (called by the worker function below) ---

def preprocess_text(text: str) -> str:
    """Normalize raw extracted text into clean, paragraph-separated prose.

    Hyphenated line breaks are rejoined, each blank-line-delimited block is
    collapsed onto a single line with single spaces, and the surviving
    blocks are joined with one blank line between them.

    Args:
        text: Raw text; a falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The cleaned text, or ``""`` when nothing but whitespace remains.
    """
    if not text:
        return ""

    # Undo end-of-line hyphenation first ("exam-\nple" -> "example"), then
    # drop any remaining hyphen that sits directly before a newline.
    dehyphenated = re.sub(r"-\n", "", re.sub(r"(\w)-\n(\w)", r"\1\2", text))

    # A paragraph boundary is a newline, optional whitespace, and another newline.
    flattened_blocks = (
        re.sub(r'\s+', ' ', block.strip().replace('\n', ' ')).strip()
        for block in re.split(r'\n\s*\n', dehyphenated)
    )
    return "\n\n".join(block for block in flattened_blocks if block)

def extract_text_from_pdf(pdf_path: str) -> tuple[str, bool]:
    """Extract the text layer of a PDF and flag files that appear to have none.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        A ``(text, is_likely_scanned)`` tuple. ``text`` is the stripped text
        of every page that produced any, joined by blank lines. The flag is
        ``False`` when the average characters per page reaches
        ``MIN_CHARS_FOR_NON_SCANNED_PDF_PER_PAGE_AVG`` or when any text at
        all was extracted, and ``True`` otherwise. A PDF without pages, or
        any exception while reading, yields ``("", True)``.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if not pages:
                return "", True
            num_pages = len(pages)

            page_texts = []
            for page in pages:
                raw = page.extract_text(x_tolerance=1, y_tolerance=3, use_text_flow=True, keep_blank_chars=False)
                if raw:
                    page_texts.append(raw.strip())
            char_total = sum(len(chunk) for chunk in page_texts)

            meets_average = (
                num_pages > 0
                and (char_total / num_pages) >= MIN_CHARS_FOR_NON_SCANNED_PDF_PER_PAGE_AVG
            )
            # Any extracted text also counts as "not scanned", even when the
            # per-page average falls short of the threshold.
            looks_scanned = not (meets_average or char_total > 0)

        # Pages that were only whitespace became "" after stripping; skip them.
        combined = "\n\n".join(chunk for chunk in page_texts if chunk)
        return combined, looks_scanned
    except Exception as e:
        # Reading failures are reported on stdout and treated like a scanned file.
        print(f"Error reading PDF '{os.path.basename(pdf_path)}': {e}")
        return "", True

# --- Worker Function for Parallel Processing ---
# This function must be at the top level for ProcessPoolExecutor
def worker_process_pdf(pdf_path: str) -> dict:
    """Convert one PDF to a sibling ``.txt`` file and report the outcome.

    Kept at module top level so it can be submitted to a process pool.

    Args:
        pdf_path: Path of the PDF to convert. The text file is written to
            the same directory, with the same base name and a ``.txt``
            extension.

    Returns:
        A dict with ``"status"``, ``"pdf_path"`` and ``"message"`` keys; a
        successful conversion additionally carries ``"txt_filepath"``.
        Statuses start with ``"success"``, ``"skipped"`` or ``"error"``.
    """
    pdf_name = os.path.basename(pdf_path)
    target_dir = os.path.dirname(pdf_path)
    print(f"Worker starting for: {pdf_name}")

    def outcome(status: str, message: str) -> dict:
        # Every non-success result shares this three-key shape.
        return {"status": status, "pdf_path": pdf_path, "message": message}

    try:
        if not os.path.isfile(pdf_path):
            return outcome("error_file_not_found", f"File not found: {pdf_name}")

        raw_text, looks_scanned = extract_text_from_pdf(pdf_path)
        if looks_scanned:
            return outcome("skipped_scanned", f"Skipped scanned/empty: {pdf_name}")
        if not raw_text.strip():
            return outcome("skipped_no_text", f"No text extracted: {pdf_name}")

        cleaned_text = preprocess_text(raw_text)
        if not cleaned_text.strip():
            return outcome("skipped_empty_after_preprocessing", f"Empty after preprocessing: {pdf_name}")

        stem, _extension = os.path.splitext(pdf_name)
        txt_filepath = os.path.join(target_dir, f"{stem}.txt")
        with open(txt_filepath, "w", encoding="utf-8") as sink:
            sink.write(cleaned_text)

        return {
            "status": "success",
            "pdf_path": pdf_path,
            "txt_filepath": txt_filepath,
            "message": f"Converted: {pdf_name}",
        }
    except IOError as e:
        # Failure while creating or writing the output file.
        return outcome("error_write", f"IOError writing TXT for {pdf_name}: {e}")
    except Exception as e:
        # Anything else raised during extraction or preprocessing.
        print(f"Unhandled exception in worker for {pdf_name}: {e}")
        return outcome("error_processing", f"Error processing {pdf_name}: {e}")


# --- GUI Application Class ---
class PDFConverterApp:
    """Drag-and-drop Tk front end that converts a list of PDF files to TXT.

    PDFs dropped onto the window are collected in ``file_paths``. Pressing
    the convert button submits every listed file to ``worker_process_pdf`` in
    a process pool. The pool's futures are polled from the Tk event loop, so
    the window stays responsive while workers run and every widget update
    happens on the main thread.
    """

    # Milliseconds between checks of the outstanding worker futures.
    _POLL_INTERVAL_MS = 100
    # ProcessPoolExecutor rejects max_workers above 61 on Windows.
    _MAX_POOL_WORKERS = 61

    def __init__(self, root_window: "TkinterDnD.Tk"):
        """Build the window and initialise all conversion state.

        Args:
            root_window: The TkinterDnD root window that hosts the widgets.
        """
        self.root = root_window
        self.root.title("PDF to TXT Converter (Optimized)")
        self.root.geometry("700x550")
        self.file_paths = []

        # Progress counters for the current batch. They are created before
        # the widgets so no callback can ever observe them missing.
        self.successful_conversions = 0
        self.skipped_files = 0
        self.error_files = 0
        self.processed_file_count = 0
        self.total_files_to_process = 0

        # Pool and futures of the batch in flight (None / empty when idle).
        self._executor = None
        self._pending_futures = {}

        self._setup_styles()
        self._setup_ui()

    def _setup_styles(self):
        """Configure the ttk theme and the named styles used by the widgets."""
        style = ttk.Style()
        style.theme_use('clam')
        style.configure("TLabel", padding=5, font=("Helvetica", 10))
        style.configure("TButton", padding=6, font=("Helvetica", 10, "bold"))
        style.configure("Accent.TButton", foreground="white", background="#0078D7", font=("Helvetica", 11, "bold"), padding=8)
        style.map("Accent.TButton", background=[('active', '#005A9E')])
        style.configure("Listbox.TFrame", background="white")
        style.configure("Status.TLabel", padding=5, relief=tk.SUNKEN, anchor=tk.W, font=("Helvetica", 9))

    def _setup_ui(self):
        """Create the drop label, file list, action buttons and status bar."""
        main_frame = ttk.Frame(self.root, padding="10")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # Drop target 1: the instruction banner.
        drop_instruction_label = ttk.Label(
            main_frame, text="Drag & Drop PDF Files Here or onto the List Below",
            font=("Helvetica", 12, "italic"), relief=tk.SOLID, padding=20, anchor=tk.CENTER
        )
        drop_instruction_label.pack(fill=tk.X, pady=(0, 10))
        drop_instruction_label.drop_target_register(DND_FILES)
        drop_instruction_label.dnd_bind('<<Drop>>', self._handle_drop_event)

        list_container_frame = ttk.Frame(main_frame, style="Listbox.TFrame", relief=tk.GROOVE, borderwidth=2)
        list_container_frame.pack(fill=tk.BOTH, expand=True, pady=5)

        # Drop target 2: the list of queued files, with a vertical scrollbar.
        self.file_listbox = tk.Listbox(
            list_container_frame, selectmode=tk.EXTENDED, width=80, height=15,
            bg="white", fg="black", font=("Consolas", 10)
        )
        self.file_listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5)
        list_scrollbar = ttk.Scrollbar(list_container_frame, orient=tk.VERTICAL, command=self.file_listbox.yview)
        list_scrollbar.pack(side=tk.RIGHT, fill=tk.Y, padx=(0, 5), pady=5)
        self.file_listbox.config(yscrollcommand=list_scrollbar.set)
        self.file_listbox.drop_target_register(DND_FILES)
        self.file_listbox.dnd_bind('<<Drop>>', self._handle_drop_event)

        button_frame = ttk.Frame(main_frame)
        button_frame.pack(fill=tk.X, pady=(10, 0))
        self.convert_button = ttk.Button(
            button_frame, text="Convert Selected to TXT",
            command=self._start_conversion_process, style="Accent.TButton"
        )
        self.convert_button.pack(side=tk.LEFT, padx=(0, 10), expand=True, fill=tk.X)
        clear_list_button = ttk.Button(button_frame, text="Clear List", command=self._clear_file_list)
        clear_list_button.pack(side=tk.RIGHT, padx=(10, 0))

        self.status_var = tk.StringVar()
        self.status_var.set("Ready. Drag PDF files to add them to the list.")
        status_bar = ttk.Label(self.root, textvariable=self.status_var, style="Status.TLabel")
        status_bar.pack(side=tk.BOTTOM, fill=tk.X)

    def _handle_drop_event(self, event):
        """Add every dropped, existing ``.pdf`` file that is not yet listed.

        Args:
            event: The drop event; ``event.data`` holds the dropped paths as
                a Tcl list.
        """
        try:
            dropped_files_raw = self.root.tk.splitlist(event.data)
            if not dropped_files_raw:
                return

            added_count = 0
            for f_path_raw in dropped_files_raw:
                abs_path = os.path.abspath(os.path.normpath(f_path_raw))
                if not (abs_path.lower().endswith(".pdf") and os.path.isfile(abs_path)):
                    print(f"Warning: Ignoring non-PDF or invalid file: '{f_path_raw}'")
                    continue
                if abs_path in self.file_paths:
                    print(f"Info: File '{os.path.basename(abs_path)}' already in list.")
                    continue
                self.file_paths.append(abs_path)
                self.file_listbox.insert(tk.END, os.path.basename(abs_path) + f"  ({os.path.dirname(abs_path)})")
                added_count += 1

            if added_count > 0:
                self.status_var.set(f"Added {added_count} PDF file(s). Total: {len(self.file_paths)}.")
            else:
                self.status_var.set("No new valid PDF files were added from the drop.")
        except Exception as e:
            messagebox.showerror("Drop Error", f"Failed to process dropped files: {e}")
            print(f"Critical Drop Error: {e}, Raw Data: '{event.data}'")

    def _clear_file_list(self):
        """Remove every queued file from both the model and the listbox."""
        self.file_paths.clear()
        self.file_listbox.delete(0, tk.END)
        self.status_var.set("File list cleared. Ready for new PDF files.")
        print("File list cleared.")

    def _update_ui_after_task(self, result: dict):
        """Record one worker result and refresh the status bar.

        Finalizes the batch once the number of recorded results reaches
        ``total_files_to_process``.

        Args:
            result: Outcome dict with ``"status"``, ``"message"`` and
                ``"pdf_path"`` keys, as returned by ``worker_process_pdf``.
        """
        status = result.get("status", "unknown_status")
        message = result.get("message", "Unknown outcome")
        pdf_path = result.get("pdf_path")
        base_name = os.path.basename(pdf_path) if pdf_path else "Unknown file"

        print(f"Task for '{base_name}': {status} - {message}")  # Log to console

        if status == "success":
            self.successful_conversions += 1
        elif status.startswith("skipped"):
            self.skipped_files += 1
        elif status.startswith("error"):
            self.error_files += 1
        else:
            # A malformed result is counted as an error so totals still add up.
            print(f"Warning: Unknown status '{status}' for {base_name}")
            self.error_files += 1

        self.processed_file_count += 1
        self.status_var.set(
            f"Processed {self.processed_file_count}/{self.total_files_to_process}. "
            f"Success: {self.successful_conversions}, Skipped: {self.skipped_files}, Errors: {self.error_files}. "
            f"Last: {base_name} ({status})"
        )

        if self.processed_file_count == self.total_files_to_process:
            self._finalize_conversion()

    def _shutdown_executor(self):
        """Release the process pool of the current batch, if there is one."""
        executor, self._executor = self._executor, None
        if executor is not None:
            # wait=False: this runs on the Tk thread and must not block it.
            executor.shutdown(wait=False)

    def _finalize_conversion(self):
        """Release the pool, re-enable the button and show the batch summary."""
        self._shutdown_executor()
        self.convert_button.config(state=tk.NORMAL)
        summary_message = (
            f"Conversion Complete! "
            f"Successful: {self.successful_conversions}, "
            f"Skipped: {self.skipped_files}, "
            f"Errors: {self.error_files}."
        )
        self.status_var.set(summary_message)
        messagebox.showinfo("Conversion Finished", summary_message)
        print(summary_message)

    def _start_conversion_process(self):
        """Submit every listed PDF to a process pool and start polling it.

        Returns immediately after scheduling the first poll; results are
        collected by ``_poll_pending_futures`` so the event loop keeps
        running while the workers are busy.
        """
        if not self.file_paths:
            messagebox.showinfo("No Files", "Please add PDF files to the list before converting.")
            return

        self.convert_button.config(state=tk.DISABLED)

        # Reset counters for this batch.
        self.successful_conversions = 0
        self.skipped_files = 0
        self.error_files = 0
        self.processed_file_count = 0

        paths_to_process = list(self.file_paths)  # Snapshot: the list may change while running.
        self.total_files_to_process = len(paths_to_process)

        # os.cpu_count() may return None. More workers than files is wasted,
        # and the pool size has a hard upper bound on Windows.
        cpu_total = os.cpu_count() or 1
        num_workers = max(1, min(cpu_total, len(paths_to_process), self._MAX_POOL_WORKERS))
        print(f"Starting conversion with up to {num_workers} worker processes.")
        self.status_var.set(f"Initializing conversion for {self.total_files_to_process} files with {num_workers} workers...")
        self.root.update_idletasks()  # Ensure this message is shown

        try:
            self._executor = concurrent.futures.ProcessPoolExecutor(max_workers=num_workers)
            self._pending_futures = {
                self._executor.submit(worker_process_pdf, pdf_path): pdf_path
                for pdf_path in paths_to_process
            }
        except Exception as exc:
            # Without this the button would stay disabled forever.
            print(f"Could not start worker pool: {exc}")
            self._pending_futures = {}
            self._shutdown_executor()
            self.convert_button.config(state=tk.NORMAL)
            self.status_var.set(f"Conversion could not be started: {exc}")
            messagebox.showerror("Conversion Error", f"Could not start the conversion workers: {exc}")
            return

        self.root.after(self._POLL_INTERVAL_MS, self._poll_pending_futures)

    def _poll_pending_futures(self):
        """Collect finished worker futures and reschedule while any remain.

        Runs on the Tk event loop. Each finished future is removed from
        ``_pending_futures`` and its result (or a synthesized error result,
        if the future raised) is passed to ``_update_ui_after_task``, which
        finalizes the batch after the last one.
        """
        finished = [future for future in self._pending_futures if future.done()]
        for future in finished:
            pdf_path = self._pending_futures.pop(future)
            try:
                result = future.result()
            except Exception as exc:
                # The worker raised instead of returning a dict, or the pool
                # itself failed (e.g. a worker process died).
                base_name_for_error = os.path.basename(pdf_path)
                print(f"Critical error processing future for {base_name_for_error}: {exc}")
                result = {
                    "status": "error_critical_future",
                    "pdf_path": pdf_path,
                    "message": f"Unexpected framework exception for {base_name_for_error}: {exc}"
                }
            self._update_ui_after_task(result)

        if self._pending_futures:
            self.root.after(self._POLL_INTERVAL_MS, self._poll_pending_futures)


# --- Main Execution ---
def main():
    """Create the drag-and-drop enabled root window and run the application."""
    window = TkinterDnD.Tk()
    # Bind the app to a name so it is referenced for the whole event loop.
    converter_app = PDFConverterApp(window)
    window.mainloop()

if __name__ == "__main__":
    # Start the GUI only when this file is run as a script. Worker processes
    # created by multiprocessing may re-import this module (notably with the
    # "spawn" start method used on Windows); this guard keeps them from
    # re-executing the application start-up code.
    main()